Topic Modeling on the NIPS Dataset Using Gaussian LDA with Word Embeddings


In [86]:
import numpy as np
import os
from operator import itemgetter
from collections import Counter
import scipy.stats as stat
from gensim.models import Word2Vec
from nltk.corpus import stopwords
import FastGaussianLDA2

Loading the word-vector model with gensim


In [97]:
wvmodel = Word2Vec.load_word2vec_format(
    "/Users/michael/Documents/Gaussian_LDA-master/data/glove.wiki/glove.6B.50d.txt", binary=False)
print "word-vector dimension: {}".format(wvmodel.vector_size())


WARNING:gensim.models.word2vec:consider setting layer size to a multiple of 4 for greater performance

Sets of vocabulary to filter on: NLTK stop words and the GloVe vocabulary


In [108]:
wv_vocab = set(wvmodel.vocab.keys())
stops = set(stopwords.words("english"))

Document cleaning

  • Tokenizing on whitespace only
  • No lemmatization or stemming
  • Removing words containing non-ASCII characters
  • Removing stop words
  • Removing words not in the GloVe vocabulary
  • Removing non-alphabetic tokens (e.g. numbers and symbols)
  • Removing short words (2 characters or fewer)
  • Lowercasing all words

In [109]:
corpus = []
nips_path = "/Users/michael/Documents/GaussianLDA/data/"
for folder in os.listdir(nips_path)[1:]:  # [1:] skips the first listdir entry (e.g. a hidden file such as .DS_Store)
    for doc in os.listdir(nips_path + folder):
        with open(nips_path + folder + "/" + doc, 'r') as f:
            txt = f.read().split()
            txt = map(lambda x: x.lower(), txt)  # Lowercasing each word
            txt = filter(lambda word: all(ord(letter) < 128 for letter in word), txt)  # Dropping words with non-ASCII characters
            txt = filter(lambda x: x not in stops, txt)  # Removing stop words
            txt = filter(lambda x: x.isalpha(), txt)  # Removing non-letter words (e.g. numbers and symbols)
            txt = filter(lambda x: len(x) > 2, txt)  # Removing super-short words and single letters
            txt = filter(lambda x: x in wv_vocab, txt)  # Keeping only words in the GloVe vocabulary
            txt = ' '.join(txt)
            corpus.append(txt)

In [ ]:
print "Number of documents in corpus: {}".format(len(corpus))

In [ ]:
reload(FastGaussianLDA2)
topics = 50
dim = 50
run_num = 1
outputfile = "/Users/michael/Documents/GaussianLDA/output/NIPS_{}_{}T_{}D_".format(str(run_num),
                                                                                  str(topics), 
                                                                                  str(dim))
lda = FastGaussianLDA2.Gauss_LDA(topics, corpus, word_vector_model=wvmodel, alpha=.5, outputfile=outputfile)
lda.fit(50)  # Number of sampling iterations to run


Done processing corpus with 111 documents
There are 9007 words that could be converted to word vectors in your corpus 
There are 0 words that could NOT be converted to word vectors
getting cluster centroids
[   425.   2899.    465.   1933.    560.   3618.   1019.    840.    752.
   4657.    413.   1823.    818.   2918.   2437.    626.   2464.   2074.
   5779.    579.   2841.    419.    451.    193.   4595.   2902.   2167.
   1466.    537.   6669.   1570.   2227.    720.   1901.    541.    934.
    494.   3562.    728.  10024.   9659.   3327.    446.   5769.   2181.
   3879.   3630.    732.    411.   4745.]
Initialization complete
Starting fit
print topic means
TOPIC 0: (u'lurching', u'wobbling', u'awkwardly', u'veering', u'momentarily', u'nimbly', u'gingerly', u'stealthily', u'yanking')
TOPIC 1: (u'consider', u'decision', u'considering', u'whether', u'accept', u'step', u'possibility', u'should', u'would')
TOPIC 2: (u'maximally', u'describable', u'axiomatically', u'biochemically', u'redundantly', u'advantageously', u'deterministically', u'contextualised', u'cognisant')
TOPIC 3: (u'text', u'reference', u'explaining', u'describing', u'read', u'articles', u'description', u'detailed', u'publish')
TOPIC 4: (u'trade-offs', u'sub-categories', u'inter-related', u'morphologies', u'derivations', u'subdomains', u'continuities', u'above-mentioned', u'polarities')
TOPIC 5: (u'the', u'part', u'also', u'as', u'country', u'both', u'member', u'in', u'united')
TOPIC 6: (u'moore', u'smith', u'collins', u'allen', u'anderson', u'freeman', u'thompson', u'walker', u'clark')
TOPIC 7: (u'infection', u'brain', u'inflammation', u'respiratory', u'disease', u'stomach', u'symptoms', u'ailments', u'treating')
TOPIC 8: (u'fine-tuning', u'easiness', u'pluralization', u'materialisation', u'spacial', u'signification', u'break-down', u'linearity', u'quantifying')
TOPIC 9: (u'many', u'other', u'are', u'mostly', u'number', u'some', u'most', u'mainly', u'several')
TOPIC 10: (u'souza', u'caballero', u'carvalho', u'osorio', u'almeida', u'barreto', u'coutinho', u'aranda', u'figueiredo')
TOPIC 11: (u'applies', u'permitting', u'requirement', u'apply', u'requiring', u'rules', u'requirements', u'provisions', u'applicable')
TOPIC 12: (u'cat', u'snake', u'yellow', u'dog', u'monkey', u'red', u'tree', u'pink', u'blue')
TOPIC 13: (u'evidence', u'investigators', u'investigation', u'examining', u'investigating', u'case', u'involved', u'revealed', u'investigated')
TOPIC 14: (u'studies', u'science', u'biology', u'psychology', u'studying', u'sciences', u'graduate', u'harvard', u'mathematics')
TOPIC 15: (u'ferina', u'jiwamol', u'kanoksilp', u'monechma', u'mangxamba', u'thongrung', u'el1l', u'pakad', u'rw96')
TOPIC 16: (u'taken', u'fire', u'carry', u'carried', u'ground', u'carrying', u'leaving', u'inside', u'immediately')
TOPIC 17: (u'sense', u'necessity', u'demonstrates', u'importance', u'determination', u'fundamental', u'genuine', u'respect', u'regard')
TOPIC 18: (u'finite', u'formula_1', u'discrete', u'linear', u'formula_3', u'inverse', u'formula_2', u'symmetric', u'formula_4')
TOPIC 19: (u'individual-level', u'affixation', u'caustics', u'foregrounded', u'technobabble', u'nonlinearities', u'm.sambucetti', u'sreerema', u'r.winters')
TOPIC 20: (u'shows', u'featured', u'feature', u'show', u'musical', u'music', u'features', u'featuring', u'original')
TOPIC 21: (u'subtle', u'expressive', u'idiosyncratic', u'playful', u'evocative', u'whimsical', u'startlingly', u'strikingly', u'wonderfully')
TOPIC 22: (u'honeywell', u'motorola', u'packard', u'xerox', u'lucent', u'3com', u'alliedsignal', u'emc', u'raytheon')
TOPIC 23: (u'chung', u'yu', u'ching', u'ying', u'yang', u'yong', u'lin', u'yi', u'wu')
TOPIC 24: (u'something', u'sure', u'how', u'so', u'what', u'always', u'really', u'anything', u'everyone')
TOPIC 25: (u'built-in', u'circuitry', u'utilizes', u'devices', u'interface', u'sensor', u'optimized', u'analog', u'utilizing')
TOPIC 26: (u'cost', u'payments', u'pay', u'cash', u'costs', u'funds', u'amount', u'money', u'benefits')
TOPIC 27: (u'horizontal', u'curved', u'rectangular', u'shaped', u'vertical', u'circular', u'cylindrical', u'surfaces', u'rows')
TOPIC 28: (u'hutagalung', u'bentivegna', u'chambless', u'quilici', u'vanags', u'pirkko', u'zety', u'frohlich', u'wilbourn')
TOPIC 29: (u'context', u'characteristics', u'concepts', u'aspects', u'examples', u'particular', u'forms', u'specific', u'distinct')
TOPIC 30: (u'excitation', u'amplitude', u'attenuation', u'oscillating', u'oscillations', u'electromagnetic', u'vibration', u'modulation', u'voltage')
TOPIC 31: (u'cause', u'persistent', u'impact', u'inevitably', u'caused', u'resulting', u'serious', u'avoid', u'consequence')
TOPIC 32: (u'sufficiently', u'inherently', u'problematic', u'incapable', u'overly', u'fairly', u'reasonably', u'socially', u'morally')
TOPIC 33: (u'proteins', u'molecules', u'inhibits', u'receptors', u'molecule', u'receptor', u'inhibition', u'protein', u'differentiation')
TOPIC 34: (u'richmond', u'carolina', u'michigan', u'maryland', u'ohio', u'virginia', u'connecticut', u'indiana', u'illinois')
TOPIC 35: (u'decoding', u'retrieval', u'templates', u'databases', u'synchronization', u'encrypting', u'template', u'query', u'typing')
TOPIC 36: (u'bsr', u'afic', u'qms', u'efm', u'dpg', u'cnu', u'oaa', u'gmi', u'lna')
TOPIC 37: (u'straight', u'third', u'second', u'break', u'behind', u'fourth', u'back', u'half', u'put')
TOPIC 38: (u'liquid', u'ingredients', u'mixture', u'milk', u'raw', u'butter', u'sugar', u'dried', u'fruit')
TOPIC 39: (u'systems', u'use', u'using', u'commercial', u'system', u'operating', u'technology', u'uses', u'equipment')
TOPIC 40: (u'fact', u'rather', u'though', u'yet', u'indeed', u'particular', u'very', u'this', u'although')
TOPIC 41: (u'rise', u'decline', u'higher', u'expectations', u'rate', u'growth', u'gains', u'drop', u'prices')
TOPIC 42: (u'lohm', u'iz', u'za', u'yekt', u'ber', u'yev', u'tz', u'buhs', u'noss')
TOPIC 43: (u'on', u'before', u'came', u'since', u'after', u'days', u'months', u'during', u'followed')
TOPIC 44: (u'stretches', u'area', u'along', u'adjacent', u'across', u'portion', u'crossing', u'northeast', u'east')
TOPIC 45: (u'improve', u'aim', u'focus', u'ensure', u'develop', u'enhance', u'necessary', u'creating', u'establish')
TOPIC 46: (u'cerebellum', u'brainstem', u'neurons', u'cortical', u'ganglia', u'cortex', u'thalamus', u'vestibular', u'axons')
TOPIC 47: (u'infer', u'imply', u'logically', u'implication', u'insofar', u'characterize', u'plausible', u'ascribe', u'discern')
TOPIC 48: (u'winkler', u'vogel', u'cleary', u'hauser', u'mendelsohn', u'pickard', u'weinberg', u'gottlieb', u'metzger')
TOPIC 49: (u'measured', u'depending', u'decreases', u'probability', u'ratio', u'varies', u'exceeds', u'decreasing', u'increases')


Document-Topic Counts:, [   425.   2899.    465.   1933.    560.   3618.   1019.    840.    752.
   4657.    413.   1823.    818.   2918.   2437.    626.   2464.   2074.
   5779.    579.   2841.    419.    451.    193.   4595.   2902.   2167.
   1466.    537.   6669.   1570.   2227.    720.   1901.    541.    934.
    494.   3562.    728.  10024.   9659.   3327.    446.   5769.   2181.
   3879.   3630.    732.    411.   4745.]
0 docs sampled
20 docs sampled
40 docs sampled
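
The topic listings above are the words whose GloVe embeddings lie closest to each topic's Gaussian mean. As a rough sketch of how such a listing could be reproduced outside the sampler, the cell below ranks the embedding vocabulary by cosine similarity to a mean vector. Note that lda.topic_means is a hypothetical attribute (the real name in FastGaussianLDA2 may differ), and model.syn0 / model.index2word assume an older gensim Word2Vec object like the one loaded above.

In [ ]:
def nearest_words(vec, model, topn=9):
    # Rank the embedding vocabulary by cosine similarity to `vec`
    mat = model.syn0  # (vocab_size x dim) embedding matrix in older gensim versions
    sims = mat.dot(vec) / (np.linalg.norm(mat, axis=1) * np.linalg.norm(vec) + 1e-12)
    return [model.index2word[i] for i in np.argsort(-sims)[:topn]]

# lda.topic_means is assumed to be a (topics x dim) array of fitted Gaussian means
for k, mu in enumerate(lda.topic_means):
    print "TOPIC {}: {}".format(k, nearest_words(mu, wvmodel))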
